**06/28/2021
**file to bring in CPD FOIA arrests data received in 2021 & assign unique IDs
 
**Housekeepig
local ver_suff _9_21
clear all
local ver_suff _9_21
clear all
if "`c(username)'"=="taehokim" |"`c(username)'"=="Taeho Kim"{ 
	cd "/Users/`c(username)'/Dropbox (Penn Law)/misconduct/do_files`ver_suff'"
} 
else{
	cd "/Users/`c(username)'/Dropbox/misconduct/do_files`ver_suff'"
}
set more off

//later pass through profiles dataset to fuzzy match 
local profiles_match `1'

*******************
*BRING IN ARREST DATA
*******************

//run the following to read in arrests data
use ../processed_data`ver_suff'/arrests_officers/officers_all.dta, clear
 
 
**Read officer name
gen fname = trim(officer_first_name)
gen lname = trim(officer_last_name)
replace mi = trim(mi)

 
**Read officer appointment date
gen appoint_dt = appointed_date
gen appoint_yr = year(appoint_dt)
gen appoint_m = month(appoint_dt)
gen appoint_d = day(appoint_dt)


//some obs have variations of last names: deflate last names
gen lname_mod = subinstr(lname, "-", "", .)
replace lname_mod = subinstr(lname_mod, " ", "", .)
replace lname_mod = subinstr(lname_mod, ".", "", .)


**Save intermediate version of data
save ../processed_data`ver_suff'/FOIA_processing/arrests, replace

**Deduplicate by matching variables
duplicates drop fname lname_mod mi appoint_yr appoint_m appoint_d , force
keep fname lname lname_mod mi appoint_yr appoint_m appoint_d


**Generate preliminary id
sort fname lname_mod mi appoint_yr appoint_m appoint_d
gen pid = _n 

**Create soundex variables
gen fname_sdx = soundex(fname)
gen lname_sdx = soundex(lname)
 
**Save version of data with preliminary ID
save ../processed_data`ver_suff'/FOIA_processing/arrest_pid, replace
clear


*******************
*SET UP LIST OF DATA TO STORE MATCHED IDS
*******************
clear 

set obs 1
gen u_pid = -1
gen pid = -1

save ../processed_data`ver_suff'/FOIA_processing/pid_appending, replace
clear


*******************
*FUZZY MATCHING
*******************

use ../processed_data`ver_suff'/FOIA_processing/arrest_pid, clear // start with 24,389

**Perfect name, appoint date  match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 1 2 2 2 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 1 //19,111 so far


**Perfect name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 2  

**Perfect first name, soundex last name, appoint date, and   no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 1 0 2 2 2 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 3

**Soundex both names, middle initial, and appoint date, match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 1 2 2 2 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 4  

**Soundex both names, no middle initial, and appoint date,  match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0 2 2 2 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 5  

**first name, no middle initial, no last name and appoint date match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 0 0 2 2 2 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 6  

**first name, no middle initial, no last name and appoint date  match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 0 0 2 2 2 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 7

**first name, last name, off in appoint_yr 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 1 2 2 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 9  


**first name, last name, off in appoint_mo 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 1 2 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 10  


**first name, last name, off in appoint_day 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 1 0 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 11  


**first name, last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 13  
 
**soundex first name, soundex last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 14  
   
 
 
*******************
*KEEP BEST MATCHES
******************* 

use ../processed_data`ver_suff'/FOIA_processing/pid_appending.dta, clear

//drop the less good matches if individuals in awards data are matched with more than one in profiles

sort pid match_no u_pid
quietly by pid:  gen dup = cond(_N==1,0,_n)
sort dup
tab dup

drop if dup>1 
drop dup 
sort pid match_no

tempfile best_matches
save `best_matches'

*******************
*MERGE DE-DUPLICATED ARREST DATA WITH MATCHINGS
******************* 

use ../processed_data`ver_suff'/FOIA_processing/arrest_pid, clear

merge 1:1 pid using `best_matches', keep(1 3) //  24,066 MATCHED AMONG 24,389

 

